# -*- coding: utf-8 -*-
"""
Created on Thu Aug  1 14:10:19 2019

@author: rklotz

Updated 4/20/2020
"""

import numpy as np
import pandas as pd
import os

#import datetime 
import pytz

pacific_tz = pytz.timezone("US/Pacific")

# tracks file (this includes all tracks)
tracks_main_csv = r"H:\My Drive\Boats\ReplicationCode\data\final\CA_tracks.csv"
tracks_other_csv = r"H:\My Drive\Boats\ReplicationCode\data\final\CA_tracks_other.csv"


# EC data after being cleaned in stata
EC_dta = r"H:\My Drive\Boats\ReplicationCode\data\entrance_clearance\ec0716_clean.dta"

# more aggregate region definitions
agg_region = r"H:\My Drive\Boats\ReplicationCode\data\ECport_agg.xls"

# port names xlsx file
ports_decs = r"H:\My Drive\Boats\ReplicationCode\data\port_decs_wAK_HI.xls"

out_file = r'H:\My Drive\Boats\ReplicationCode\results\sum_stats\classif_%s_ent_v2a.tex'


# make folder for output
if not os.path.exists(os.path.split(out_file)[0]):
    os.makedirs(os.path.split(out_file)[0])   

# load ports file
ports_df = pd.read_excel(ports_decs)

# load region file
region_df = pd.read_excel(agg_region)

# load entrance/clearance
print("Loading E/C")
df_EC = pd.read_stata(EC_dta)
df_EC['clearance'] = df_EC['typedoc']
mdy = df_EC[['year','month','day']].astype('int32')
df_EC['date'] = pd.to_datetime(mdy).dt.tz_localize(pacific_tz)
df_EC['fromEC'] = 1

# dropping 2007-2008
df_EC = df_EC[df_EC.year>=2009]

# dropping missing IMO
df_EC = df_EC.loc[~df_EC.imo_upd.isnull()] # dropping missing IMO

# adding broad where_port aggregation
df_EC = df_EC.merge(region_df,left_on='where_port_agg2',right_on='where_port_agg_orig',how='inner')
        
# load tracks
print("Loading Tracks")
cols = ['TIME1','TIME2','PORT1','PORT2','RIGHT1','RIGHT2','AISyear','MMSI','IMO'
        ,'port_name1','port_agg1','port_name2','port_agg2','dirNW','INTERP_FLAG','VesselType','group_agg','Length','Width']
df_tracks = pd.read_csv(tracks_main_csv,low_memory=False,usecols=cols)
df_tracks_other = pd.read_csv(tracks_other_csv,low_memory=False,usecols=cols)

df_tracks = pd.concat([df_tracks,df_tracks_other],axis=0,ignore_index=True,sort=True)
del df_tracks_other

# converting to pacific time (need something else for AK/HI)
df_tracks['TIME1'] = pd.to_datetime( df_tracks['TIME1'] ).dt.tz_localize('UTC').dt.tz_convert(pacific_tz)
df_tracks['TIME2'] = pd.to_datetime( df_tracks['TIME2'] ).dt.tz_localize('UTC').dt.tz_convert(pacific_tz)

df_tracks['day1'] = df_tracks.TIME1.dt.day
df_tracks['day2'] = df_tracks.TIME2.dt.day

df_tracks['month1'] = df_tracks.TIME1.dt.month
df_tracks['month2'] = df_tracks.TIME2.dt.month

#df_tracks['IMO'] = df_tracks['IMO'].str.replace('IMO','').astype('int64')
df_tracks['sample_ind'] = ~ ( df_tracks.IMO.isna() | df_tracks.INTERP_FLAG==1 ) # dropping missing IMO

df_tracks['fromAIS'] = 1




#t2 = pd.to_datetime( df_tracks['TIME2'] )
#t2_pst = t2.dt.tz_localize('UTC').dt.tz_convert(pacific_tz)
#df_tracks['TIME2'] = pd.to_datetime(df_tracks['TIME2']).tz_localize(pacific_tz)

###############################
# getting portland entrances
#df_tracks_port = df_tracks.loc[(df_tracks.PORT2==6) & (df_tracks.PORT1!=6) & (df_tracks.RIGHT2==0),:]
#df_EC_port = df_EC.loc[(df_EC.port_agg=="Port") & (df_EC.clearance==0) , :] # portland and other columbia river ports

# getting SF entrances
#df_tracks_port = df_tracks.loc[(df_tracks.PORT2==1) & (df_tracks.PORT1!=1) & (df_tracks.RIGHT2==0),:]
#df_EC_port = df_EC.loc[(df_EC.ECport_agg=="SFBay") & (df_EC.where_port_agg!="SFBay") & (df_EC.clearance==0) , : ] # sf bay entrances

# getting entrances (any port)
for port in ["SFBay","LALB"] :
    
    df_tracks_port = df_tracks.loc[(df_tracks.port_name2==port) & (df_tracks.port_name1!=port) & (df_tracks.RIGHT2==0) & df_tracks.sample_ind,:]
    df_EC_port = df_EC.loc[(df_EC.ECport_agg==port) & (df_EC.where_port_agg!=port) & (df_EC.clearance==0) , : ] # sf bay entrances
    ##############################
    
    df_EC_port = df_EC_port.sort_values(by=['date'])
    
    # removing duplicate entrances (same IMO entering same port area on same day (ie visiting to separate minor ports))
    print(len(df_EC_port))
    df_EC_port.drop_duplicates(subset=['imo_upd','date'],keep='first',inplace=True)
    print(len(df_EC_port))
    
    # merging those for portland
    #df_port = pd.merge(df_tracks_port,df_EC_port,left_on=['IMO','day2','month2','AIS_year'],right_on=['imo_upd','day','month','year'],how='outer')
    
    # merging with date buffer (merge on imo, then keep only if date gap less than 1 day )
    print('Merging')
    df_port = pd.merge(df_tracks_port,df_EC_port,left_on=['IMO'],right_on=['imo_upd'],how='outer')
    df_port['diff_hours'] = ( df_port.date - df_port.TIME2 ).dt.total_seconds() / 3600
    df_port['match'] = ( abs(df_port['diff_hours'])<=24 )
    
    # count matches per track
    df_tmp = df_port.groupby(['IMO','TIME2'],as_index=False).match.sum()
    df_port = pd.merge(df_port,df_tmp,on=['IMO','TIME2'],suffixes=('','_tot'),how='left')
    del df_tmp
    
    EC_vars = df_EC_port.columns # getting column labels
    tracks_vars = df_tracks_port.columns 
    
    df_port['merged'] = np.where( (df_port['fromEC']==1) & (df_port['fromAIS']==1) , 1 , 0 )
    df_port['EConly'] = np.where( (df_port['fromEC']==1) & (df_port['fromAIS'].isnull()) , 1 , 0 )
    df_port['AISonly'] = np.where( (df_port['fromAIS']==1) & (df_port['fromEC'].isnull()) , 1 , 0 )
    
    # here there are three groups
    ## merged and matched - tot_match>0
        # if 1 then keep
    ## merged without a match - tot_match==0
    ## not merged  - keep
    
    #df_nomerge = df_port.loc[df_port.merged==0]     # no imo match
    #df_matched = df_port.loc[(df_port.match)]       # those that matched  
    #df_nomatch = df_port.loc[df_port.match_tot==0].drop_duplicates(subset=['IMO','TIME2']).drop(EC_vars,axis=1)
    #
    #print(len(df_nomerge))
    #print(len(df_matched))
    #print(len(df_nomatch))
    
    df_port_clean = pd.concat( [ df_port.loc[df_port.merged==0]  ,
                                 df_port.loc[(df_port.match)]  ,
                                 df_port.loc[df_port.match_tot==0].drop_duplicates(subset=['IMO','TIME2']).drop(EC_vars,axis=1) ]   , sort = True    ) 
    del df_port
    del df_tracks_port
    del df_EC_port
    #del df_EC
    #del df_tracks
    
    print(df_port_clean.groupby(['merged','EConly','AISonly']).merged.count())
    
    
    df_port_clean = df_port_clean[['MMSI','IMO','PORT1','PORT2','VesselType','vessel_str','Length','Width','port_agg1','port_agg2',
                       'imo_upd','where_port', 'where_schedk',  'where_ind', 'where_name','icst','ECport_agg','ECport_agg2',
                       'TIME2','fromEC','fromAIS','port_name1','port_name2','where_port_agg','where_port_agg2','where_port_agg_broad',
                       'date','diff_hours','merged','EConly','AISonly']]
    
    
    
    df_port_clean['IMO'] = np.where( df_port_clean['IMO'].isnull() , df_port_clean['imo_upd']  , df_port_clean['IMO'] )
    df_port_clean['date_m'] = np.where( df_port_clean['date'].isnull() , df_port_clean['TIME2']  , df_port_clean['date'] )
    df_port_clean = df_port_clean.drop(['date'],axis=1)
    df_port_clean['year'] = df_port_clean.date_m.dt.year
    
    # keeping only those years with imo numbers in AIS
    df_port_clean = df_port_clean.loc[df_port_clean.year.isin([2009,2015,2016])]
    
    #rename_map = {"destination": "origin" }
    #df_port_clean = df_port_clean.rename( columns=rename_map )
    
    # replace nan
    #df_port_clean.origin = df_port_clean.origin.replace(np.nan, 'No E/C', regex=True)
    #df_port_clean.origin = df_port_clean.origin.replace('', 'Need to classify', regex=True)
    
    port_df_clean = ports_df[['port_agg','port_broad']].dropna().drop_duplicates()
    
    df_port_clean = df_port_clean.merge(port_df_clean,left_on='port_agg1',right_on='port_agg',how='left')
    df_port_clean = df_port_clean.rename(columns={'port_broad':'port_broad1'})
    df_port_clean = df_port_clean.merge(port_df_clean,left_on='port_agg2',right_on='port_agg',how='left')
    df_port_clean = df_port_clean.rename(columns={'port_broad':'port_broad2'})
    
    vs = ['port_name1','port_name2','port_agg1','port_agg2','port_broad1','port_broad2']
    for v in vs :
        df_port_clean[v] = np.where( ( df_port_clean['EConly']==1 ) & (df_port_clean[v].isnull() ) , "y_NoAIS" , df_port_clean[v] )
        df_port_clean[v] = np.where( ( df_port_clean[v].isnull() )                                 , "y_N/A" , df_port_clean[v] )
    
    df_port_clean.port_name1 = df_port_clean.port_name1.replace(np.nan, 'N/A', regex=True)
    df_port_clean.port_name2 = df_port_clean.port_name2.replace(np.nan, 'N/A', regex=True)
    df_port_clean.port_agg1 = df_port_clean.port_agg1.replace(np.nan, 'N/A', regex=True)
    df_port_clean.port_agg2 = df_port_clean.port_agg2.replace(np.nan, 'N/A', regex=True)
    
    vs = ['where_port_agg2','where_port_agg','where_port_agg_broad']
    for v in vs :
        df_port_clean[v] = np.where( df_port_clean.AISonly==1 , 'AISonly'  , df_port_clean[v] ) 
    
    print(df_port_clean.groupby(['merged','EConly','AISonly']).merged.count())
    
    
    print(df_port_clean.groupby(['AISonly','vessel_str']).IMO.count())
    print(df_port_clean.groupby(['AISonly']).IMO.count())
    # this shows that it is tanker activity that is most likely to be missing in E/C data
    # of tracks that are in AIS but not E/C
    # 686 of 982 are tankers, even though tankers are a much smaller number of total tracks
    
    
    # fraction merged by year
    print(df_port_clean.groupby(['year']).merged.mean())
    print(df_port_clean.groupby(['year']).EConly.mean())
    print(df_port_clean.groupby(['year']).AISonly.mean())
    
    orig_var = 'where_port_agg_broad' # 'where_port_agg2'
    
    # count correct matches
    # only works for port to port 
    df_port_clean['Correct'] = np.where( df_port_clean['where_port_agg2']==df_port_clean['port_agg1']  , "Correct" , np.nan )
    df_correct = df_port_clean.groupby([orig_var,'Correct'] , as_index=False ).IMO.count()
    df_correct = df_correct[df_correct.Correct=='Correct'] # dropping those not correct
    df_correct = df_correct.drop(['Correct'],axis=1)
    df_correct = df_correct.rename(columns={'IMO':'x_Correct'})
    
    
    # print table of origins in EC vs AIS
    # df for sorting Y
    #df_org = pd.DataFrame.from_dict({"sort":{'US_EC':1,'Asia':2,'Af':2,'MidE':2,'NAm':2,'SAm':2,'CAm':2,'Eur':2,'Oce':2,'N/A':3,'AISonly':4}})
    df_org = pd.DataFrame.from_dict({"sort":{'Other CA':1,'Other WC':2,'HI/AK':3,'N. Am':4,'Asia':5,'ROW':6,'AISonly':7}})
    df_org['where_port_agg_broad'] = df_org.index
    
    
    # counts, by origin EC by AIS
    #orig_var = 'where_port_agg_broad' # 'where_port_agg2'
    print(df_port_clean.groupby([orig_var]).IMO.count())
    df_orig_port = df_port_clean.groupby([orig_var,'port_broad1'],as_index=False).IMO.count()
    #df_orig_port = df_orig_port.rename(columns={'IMO':'z_Total'})
    
    df_orig_port = df_orig_port.pivot(index=orig_var, columns='port_broad1', values='IMO') # pivot
    df_orig_port['z_Total'] = df_orig_port.sum(axis=1,skipna=True) # calculate total
    #df_tmp = df_port_clean.loc[df_port_clean.origin==""]
    
    df_orig_port = df_orig_port.reset_index().merge(df_correct, how="left").set_index(orig_var) # add number correct
    
    df_orig_port = df_orig_port.reset_index().merge(df_org, how="left").set_index(orig_var) # add sorting indices
    df_orig_port.sort = df_orig_port.sort.fillna(0)
    
    #df_orig_port = df_orig_port.sort_index()
    df_orig_port['tmp_ind'] = df_orig_port.index
    df_orig_port = df_orig_port.sort_values(by=['sort','tmp_ind'])
    df_orig_port.drop(['sort','tmp_ind'],axis=1,inplace=True)
    
    df_orig_port = df_orig_port.reindex(sorted(df_orig_port.columns), axis=1) # sort columns
    df_orig_port.columns = df_orig_port.columns.str.replace('x_','')
    df_orig_port.columns = df_orig_port.columns.str.replace('y_','')
    df_orig_port.columns = df_orig_port.columns.str.replace('z_','')
    
    df_orig_port.index.name = None
    
    with open( out_file % port , 'w') as tf:
         tf.write(df_orig_port.to_latex(na_rep="",float_format="%.0f"))
    
    
    df_orig_port["N/A"] / df_orig_port["Total"]

